Non-Coding RNAs Improve the Predictive Power of Network Medicine

Author

Deisy Morselli Gysi and Albert-László Barabási

Datasets

Protein-Protein Interactome (PPI)

The human Protein-Protein Interactome was derived from 21 public databases containing different types of experimentally-derived PPI data, as described in the manuscript.

Protein-Protein & Non-Coding Interactome (PPI & NCI)

The PPI & NCI is combined from nine publicly available databases that collect and curate experimentally-derived non-coding interactions: 1) miRNA-targets, derived from reporter assay, western blot, microarray, and next-generation sequencing experiments from mirTARbase; 2) miRNAs and lncRNAs interactions validated using CLIP-seq, AGO CLIP-seq, ChIRP-seq, HITS-CLIP and PAR-CLIP from lncBook, NPinter4, DIANA Tools, RAIN, and lncRNome; 3) RNA-RNA interactions validated from transcriptome-wide sequencing-based experiments (PARIS, SPLASH, LIGRseq, and MARIO), and targeted studies (RIAseq, RAP-RNA, and CLASH) from RISE; 4) Literature curated from miRNet and miRecords.

Gene Disease Association (GDA)

Data was combined from multiple sources, as described in the manuscript, such as GWAS from ClinGen, ClinVar, CTD, Disease Enhancer, DisGeNET, GWAS Catalog, HMDD45, lncBook, LncRNA disease, LOVD, Monarch, OMIM, Orphanet, PheGenI, and PsyGeNet.

Code For Reproducing Paper Results

############################## 
# Install packages, if needed.
# Load needed packages
# Define custom needed functions
##############################

RequiredPackages <- c(
  ## Viz
  "gghalves",
  "ggplot2",
  "patchwork",
  "showtext",
  "ggrepel",
  "scales",
  "plotly",
  "UpSetR",
  "Cairo",
  
  ## Data Manipulation
  "magrittr",
  "data.table",
  "tidyverse",
  "tidyr",
  "dplyr",
  "janitor",
  
  ## Stats
  "poweRlaw", 
  
  ## NetSci
  "NetSci",
  "igraph",
  "CoDiNA",
  
  ## Others
  "progress",
  "stringr",
  "knitr",
  "parallel" 
)


for (i in RequiredPackages) { 
  if (!require(i, character.only = TRUE)) install.packages(i)
}

`%ni%` <- Negate(`%in%`)

############################## 
# Define Colors
##############################
values =  c("Protein Coding" = "#ef476f", 
            "TF" = "#f78c6b" ,
            "Pseudogene" = "#06d6a0",
            "miRNA" = "#83d483",
            "other" = "#ffd166",
            "lncRNA" = "#0cb0a9", 
            "ncRNA" = "#118ab2")

PPI_colors = c(PPI = "#8D3B72",
               `PPI & NCI` = "#72E1D1")

############################## 
# Define Functions
##############################

############################## 
# Order Alfabetically 
##############################
OrderNames <- function (M) {
  M[,1] %<>% as.character()
  M[,2] %<>% as.character()
  n1 = apply(M[, 1:2], 1, min)
  n2 = apply(M[, 1:2], 1, max)
  M[,1] <- n1
  M[,2] <- n2
  
  return(M)
}

############################## 
# Plot Disease Modules
##############################
plot_diseases = function(disease){
  disease_genes = GDA %>%
    filter(DiseaseName == disease) %>%
    pull(hgnc_symbol) %>%
    unique()
  V(gPPI)$frame.color = NA; 
  V(gPPInc)$frame.color = NA
  
  g_disease_PPI = gPPI %>%
    induced_subgraph(vids = 
                       disease_genes[disease_genes %in%
                                       V(.)$name])
  
  g_disease_PPINC = gPPInc %>%
    induced_subgraph(vids =
                       disease_genes[disease_genes %in%
                                       V(.)$name])
  
  in_lcc_ncippi =  g_disease_PPINC %>%
    extract_LCC() %>%
    V() %>%
    unclass() %>%
    names
  
  in_lcc_ppi =  g_disease_PPI %>%
    extract_LCC() %>%
    V() %>%
    unclass() %>%
    names
  
  
  V(g_disease_PPI)$degree = igraph::degree(g_disease_PPI)
  V(g_disease_PPINC)$degree = igraph::degree(g_disease_PPINC)
  
  
  
  V(g_disease_PPI)$size =
    igraph::degree(g_disease_PPI) %>%
    CoDiNA::normalize() + 0.01
  V(g_disease_PPINC)$size = 
    igraph::degree(g_disease_PPINC) %>%
    CoDiNA::normalize() + 0.01
  
  V(g_disease_PPI)$size %<>% ifelse(is.na(.), 0.5,.)
  
  
  
  E(g_disease_PPI)$width =  E(g_disease_PPI)$weight =
    edge.betweenness(g_disease_PPI, directed = F) %>%
    CoDiNA::normalize()
  
  E(g_disease_PPINC)$width = E(g_disease_PPINC)$weight =
    edge.betweenness(g_disease_PPINC, directed = F) %>%
    CoDiNA::normalize()
  
  E(g_disease_PPINC)$weight = (E(g_disease_PPINC)$weight) + 0.01
  E(g_disease_PPI)$weight = E(g_disease_PPI)$weight+ 0.01
  
  
  #
  # fix colors
  #
  
  V(g_disease_PPI)$color = 
    ifelse(V(g_disease_PPI)$name %in% in_lcc_ppi,
           V(g_disease_PPI)$color, "gray75")
  V(g_disease_PPI)$frame.color =
    V(g_disease_PPI)$color
  ###
  ###
  V(g_disease_PPINC)$color = 
    ifelse(V(g_disease_PPINC)$name %in% in_lcc_ncippi,
           V(g_disease_PPINC)$color, "gray75")
  V(g_disease_PPINC)$frame.color = 
    V(g_disease_PPINC)$color
  
  V(g_disease_PPI)$label = 
    ifelse(V(g_disease_PPI)$size >
             quantile(V(g_disease_PPI)$size, 0.99),
           V(g_disease_PPI)$name, NA)
  
  
  V(g_disease_PPINC)$label = 
    ifelse(V(g_disease_PPINC)$size > 
             quantile(V(g_disease_PPINC)$size, 0.99), 
           V(g_disease_PPINC)$name, NA)
  
  
  
  return(list(PPI = g_disease_PPI,
              NCIPPI = g_disease_PPINC))
}


############################## 
# Select Few Diseases as Reference
##############################
Interest = c("schizophrenia", 
             "crohn disease", 
             "arthritis rheumatoid", 
             "pre eclampsia", 
             "asthma", 
             "angina pectoris")

############################## 
# Define Parameters to
# be used in the analysis
##############################
sabp = 0.05 # S_ab pvalue
lccp = 0.05 # LCC pvalue
comop = 0.01 # RR pvalue
overlapp = 0.01 # Hypergeometric-test p-value
############################## 
# Load the Data
##############################
PPI = fread("../data/input/PPI.csv") ## PPI
glimpse(PPI)
Rows: 536,965
Columns: 3
$ HGNC_Symbol.1 <chr> "A1BG", "A1BG", "A1BG", "A1BG", "A1BG", "A1BG", "A1BG", …
$ HGNC_Symbol.2 <chr> "ABCC6", "ANXA7", "CDKN1A", "CRISP3", "FDXR", "GDPD1", "…
$ Source        <chr> "HINT|PINA|APID|BioGrid|Instruct", "HINT|HIPPIE|PINA|API…
############################## 
# PPI include: 
# $ HGNC_Symbol.1: HGNC Gene Name for Node 1
# $ HGNC_Symbol.2: HGNC Gene Name for Node 2
# $ Source       : Source of the interaction, "|" separated
##############################


Annot_Dic = fread("../data/input/gene_dictionary.txt") ## Gene Dictionary
glimpse(Annot_Dic)
Rows: 43,413
Columns: 2
$ Symbol      <chr> "A1BG", "A1BG-AS1", "A1CF", "A2M", "A2M-AS1", "A2ML1", "A2…
$ Category_cl <chr> "Protein Coding", "lncRNA", "Protein Coding", "Protein Cod…
############################## 
# Annot_Dic include:
# $ Symbol       : HGNC Gene Name
# $ Category_cl  : HGNC Gene Category
# $ color        : Color Definition for Viz
##############################

## 
## Define the nodes colors based on gene category
## 

Annot_Dic$Category_cl = factor(Annot_Dic$Category_cl, 
                               levels = names(values))
Annot_Dic$color = values[unclass(Annot_Dic$Category_cl)]

NCPPI = fread("../data/input/ncPPI_PPI.csv") ## PPI & NCI
glimpse(NCPPI)
Rows: 1,114,777
Columns: 3
$ HGNC_Symbol.1 <chr> "A1BG", "A1BG", "A1BG", "A1BG", "A1BG", "A1BG", "A1BG", …
$ HGNC_Symbol.2 <chr> "ABCC6", "ANXA7", "CDKN1A", "CRISP3", "FDXR", "GDPD1", "…
$ Source        <chr> "HINT|PINA|APID|BioGrid|Instruct", "HINT|HIPPIE|PINA|API…
############################## 
# NCPPI include: 
# $ HGNC_Symbol.1: HGNC Gene Name for Node 1
# $ HGNC_Symbol.2: HGNC Gene Name for Node 2
# $ Source       : Source of the interaction, "|" separated
##############################

GDA = fread("../data/input/GDA.csv") ## Gene Disease Association
glimpse(GDA)
Rows: 119,595
Columns: 8
$ DiseaseName    <chr> "hepatomegaly", "schizophrenia", "acute kidney injury",…
$ MESHID         <chr> "D006529", "D012559", "D058186", "D058186", "D018248", …
$ hgnc_symbol    <chr> "A1BG", "A1BG", "IGHA2", "A2M", "IGHA2", "A2M", "IGHA2"…
$ evidence       <chr> "CTD|DisGeNET_GDA", "CTD|DisGeNET_GDA", "CTD|DisGeNET_G…
$ TreeNumber     <chr> "C06|C23", "F03", "C13|C12", "C13|C12", "C06|C04", "C06…
$ DescriptorName <chr> "Digestive System Diseases|Pathological Conditions, Sig…
$ Total_Genes    <int> 64, 3471, 169, 169, 16, 16, 1072, 1072, 1251, 1251, 428…
$ Category_cl    <chr> "Protein Coding", "Protein Coding", "other", "Protein C…
############################## 
# GDA include: 
# DiseaseName    : Disease Name in MeSH Terms
# MESHID         : MESH ID of the disease
# hgnc_symbol    : HGNC Gene Name 
# evidence       : Sources of association
# DescriptorName : MeSH categories for the disease
# Total_Genes    : Total of Associated Genes
# Category_cl    : HGNC Gene Category
##############################
############################## 
### Create the graphs: 
### Edge list from the csv files (PPI, PPI & NCI)
### Node list from gene annotation
### Because the annotation may include genes that 
### do not exist in the graph, we delete all nodes with 
### degree = 0
##############################

### gPPI as the PPI graph
### Basic Network Analysis and
### Network caracterization


gPPI = graph_from_data_frame(PPI, directed = F, 
                             vertices = Annot_Dic) %>%
  delete.vertices(degree(.) == 0) 
V(gPPI)$degree = degree(gPPI)
gPPI
IGRAPH 5559997 UN-- 18217 536965 -- 
+ attr: name (v/c), Category_cl (v/c), color (v/c), degree (v/n),
| Source (e/c)
+ edges from 5559997 (vertex names):
 [1] A1BG--ABCC6    A1BG--ANXA7    A1BG--CDKN1A   A1BG--CRISP3   A1BG--FDXR    
 [6] A1BG--GDPD1    A1BG--GRB7     A1BG--HSP90AA1 A1BG--KCMF1    A1BG--KLK10   
[11] A1BG--LGALS3   A1BG--MATN2    A1BG--MCM10    A1BG--NRF1     A1BG--PHF11   
[16] A1BG--PRDX4    A1BG--PSME4    A1BG--RHBDD1   A1BG--RNF123   A1BG--SETD7   
[21] A1BG--SMN1     A1BG--SMN2     A1BG--SNCA     A1BG--TK1      A1BG--UBAC1   
[26] A1BG--UBE2U    A1BG--UBR4     A1BG--UBXN1    A1BG--WDR62    A1BG--ZBTB40  
[31] A1CF--APOBEC1  A1CF--CAVIN2   A1CF--CELF2    A1CF--CYP46A1  A1CF--CYSRT1  
+ ... omitted several edges
net_caracs = list()

net_caracs$PPI = data.frame(network = "PPI", 
                            diameter = diameter(gPPI), 
                            density = edge_density(gPPI, loops=TRUE), 
                            components = components(gPPI)$no, 
                transitivity = igraph::transitivity(gPPI), 
                largest_cliques = clique_num(gPPI))

x = igraph::clusters(gPPI)

net_caracs$PPI$LCC = max(x$csize)


dist = distances(NetSci::extract_LCC(gPPI))
dist[upper.tri(dist, diag = T)] <- NA
net_caracs$PPI$av_sp = mean(dist, na.rm = T)

### gPPInc as the PPI & NCI graph
gPPInc = graph_from_data_frame(NCPPI, directed = F, 
                               vertices = Annot_Dic) %>%
  delete.vertices(degree(.) == 0) 
V(gPPInc)$degree = degree(gPPInc)
gPPInc
IGRAPH fd9d53d UN-- 26575 1114777 -- 
+ attr: name (v/c), Category_cl (v/c), color (v/c), degree (v/n),
| Source (e/c)
+ edges from fd9d53d (vertex names):
 [1] A1BG--ABCC6    A1BG--ANXA7    A1BG--CDKN1A   A1BG--CRISP3   A1BG--FDXR    
 [6] A1BG--GDPD1    A1BG--GRB7     A1BG--HSP90AA1 A1BG--KCMF1    A1BG--KLK10   
[11] A1BG--LGALS3   A1BG--MATN2    A1BG--MCM10    A1BG--MIR1262  A1BG--MIR1288 
[16] A1BG--MIR141   A1BG--MIR200A  A1BG--MIR4645  A1BG--MIR4701  A1BG--MIR4772 
[21] A1BG--MIR4793  A1BG--MIR508   A1BG--MIR542   A1BG--MIR5582  A1BG--MIR6512 
[26] A1BG--MIR661   A1BG--MIR6720  A1BG--MIR6736  A1BG--MIR6742  A1BG--MIR6776 
[31] A1BG--MIR6849  A1BG--MIR766   A1BG--MIR7703  A1BG--NRF1     A1BG--PHF11   
+ ... omitted several edges
net_caracs$PPI_NCI = data.frame(network = "PPI & NCI", 
                            diameter = diameter(gPPInc), 
                            density = edge_density(gPPInc, loops=TRUE), 
                            components = components(gPPInc)$no, 
                transitivity = igraph::transitivity(gPPInc), 
                largest_cliques = clique_num(gPPInc))

x = igraph::clusters(gPPInc)

net_caracs$PPI_NCI$LCC = max(x$csize)


dist = distances(NetSci::extract_LCC(gPPInc))
dist[upper.tri(dist, diag = T)] <- NA
net_caracs$PPI_NCI$av_sp = mean(dist, na.rm = T)


net_caracs %>% 
  bind_rows() %>% 
    pivot_longer(cols = - network) %>% 
    pivot_wider(., names_from = network) %>% 
  knitr::kable()
name PPI PPI & NCI
diameter 7.000000e+00 9.000000e+00
density 3.235900e-03 3.156900e-03
components 1.300000e+01 5.200000e+01
transitivity 4.785070e-02 2.594430e-02
largest_cliques 5.700000e+01 5.700000e+01
LCC 1.820400e+04 2.648300e+04
av_sp 2.661309e+00 2.794652e+00

Table S1: Descriptive of Databases Genes and Reported Binding Experimental Interactions.

############################## 
### Build a summary table (Table S1)
### Include the number of genes and interaction type 
### in each database. 
##############################
### Because the PPI is a subset of the 
### PPI & NCI, we build the 
### table only based on the PPI & NCI.
##############################

NCI = NCPPI
n_map = Annot_Dic %>%
  select(Symbol,Category_cl)
NCI %<>% 
  dplyr::left_join(n_map, by = c("HGNC_Symbol.1"="Symbol")) %>%
  rename(Category_A = Category_cl) %>% 
  dplyr::left_join(n_map, by = c("HGNC_Symbol.2"="Symbol")) %>%
  rename(Category_B = Category_cl) %>%
  unique()

NCI %<>%
  mutate(id = 1:n())

NCI_PPI = NCI$Source %>%
  stringr::str_split(., "\\|", simplify = TRUE) %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) 

NCI_PPI_l = list()



y = 223
i = nrow(NCI_PPI)/y

for (k in 1:y){
  ini = (i*(k-1) + 1)
  end = i*k
  NCI_PPI_l[[k]] = NCI_PPI[ini:end, ]%>%
    pivot_longer(-id, values_to = "source") %>%
    select(-name) %>%
    filter(source != "") %>%
    unique() %>%
    dplyr::inner_join(NCI, ., by = 'id') %>%
    select(- c(Source, id )) %>%
    unique()
}
NCI_PPI_l %<>% 
  bind_rows() %>%
  unique()


pc = c("Protein Coding", "TF")
edges_summary = NCI_PPI_l %>%
  mutate(type = ifelse(Category_A %in% pc & 
                         Category_B %in% pc, 
                       "PC <-> PC", "PC <-> NC")) %>%
  
  mutate(type = ifelse(Category_A %ni% pc & 
                         Category_B %ni% pc, 
                       "NC <-> NC", type)) %>%
  group_by(source, type) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = type, 
              values_from = n, 
              values_fill = 0)

nodes_summary = 
  NCI_PPI_l %>%
  select(-starts_with("Category")) %>% 
  pivot_longer(cols = c(HGNC_Symbol.1, HGNC_Symbol.2)) %>%
  select(-name) %>%
  unique() %>% 
  dplyr::left_join(Annot_Dic, 
                   by = c("value"="Symbol")) %>%
  mutate(type = 
           ifelse(Category_cl %in% pc, "PC", "NC")) %>%
  group_by(type, source) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = type, values_from = n)


summary_data = dplyr::inner_join(nodes_summary,
                                 edges_summary)

summary_data %>% 
  knitr::kable()
source NC PC NC <-> NC PC <-> NC PC <-> PC
APID 411 15930 32 3151 169730
BioGrid 616 15308 31 9883 194221
BioPlex 386 13547 177 3912 104600
CoFrac 19 2959 3 205 13689
DIANA 2167 10087 15189 76060 0
DIP 2 1389 1 1 1285
ENCODE 1229 13808 1412 101569 23473
HI-Union 165 8882 17 1703 61969
HINT 320 14576 66 2528 116789
HIPPIE 100 8520 17 1437 121691
InateDB 153 5548 1 251 13378
Insider 17 3362 18 12 3939
Instruct 206 11234 45 774 50456
IntAct 56 4567 10 131 9102
Interactome3d 68 7327 108 135 15071
InWeb 67 10530 43 94 58507
KinomeNetworkX 7 2340 0 25 7367
LitBM17 20 6020 1 46 13320
LNC_Book 30 NA 21 0 0
lncRNome 391 130 0 733 245
MINT 42 4871 6 121 10616
miRecords 167 651 7 967 1
miRNet 347 917 14 2532 0
miRTarBase 511 2511 51 6616 0
NPInterv4 485 538 210 860 83
PhosphoSitePlus 7 2840 0 7 7014
PINA 282 14895 12 958 163068
QUBIC 34 5271 7 241 27453
RAIN 2973 13271 7840 383213 156
RISE 5101 15559 2333 26660 35091
Signalink 152 11279 5 415 51982

Table S2: Degreee Median and interquartile ranges for each gene category in all two networks.

##############################
### Basic Stats of degree in both networks
##############################

n2 = gPPI %>% igraph::as_data_frame(., "vertices")
n3 = gPPInc %>% igraph::as_data_frame(., "vertices")

n2 %<>% 
  mutate(base = "PPI")

n3 %<>% 
  mutate(base = "PPI & NCI")

Nodes = bind_rows(n2, n3)


Nodes %>%
  group_by(Category_cl, base) %>%
  summarise(median = quantile(degree, 0.50), 
            Q1 =  quantile(degree, 0.250), 
            Q3 =  quantile(degree, 0.750)) %>%
  mutate( result = paste0(median, " [", Q1, "; ", Q3, "]")) %>%
  select(Category_cl, base, result) %>%
  pivot_wider(names_from = base, 
              values_from = result) %>% 
  knitr::kable()
`summarise()` has grouped output by 'Category_cl'. You can override using the
`.groups` argument.
Category_cl PPI & NCI PPI
lncRNA 3 [1; 6] NA
miRNA 52 [16; 224.75] NA
other 11 [3; 25] NA
Protein Coding 54 [20; 107] 30 [12; 64]
Pseudogene 2 [1; 4] NA
TF 90 [41; 168] 48 [20; 102]

Figure S1: Network Validation: Gene-Interaction Overlap

##############################
### Upset Plot of Gene-Interaction Overlap
##############################

ps1 = NCI_PPI_l %>% 
  mutate(type = ifelse(Category_A %in% pc & 
                         Category_B %in% pc, "PC <-> PC", "PC <-> NC")) %>%
  
  mutate(type = ifelse(Category_A %ni% pc & 
                         Category_B %ni% pc, "NC <-> NC", type)) %>%
  group_by(HGNC_Symbol.1, HGNC_Symbol.2, type, source) %>%
  summarise(n = n()) %>% 
  pivot_wider(names_from = source, values_from = n, values_fill = 0) %>%
  as.data.frame() %>%
  UpSetR::upset(., order.by = 'freq', 
                sets.x.label = 'Interactions (links)', 
                mainbar.y.label = 'Overlap', 
                queries = list(list(query = elements, 
                                    params = list("type", "PC <-> PC"), 
                                    color = "#8D3B72", active = T)),
                # sets.bar.color = "#72E1D1",
                main.bar.color = "#72E1D1",
                nsets = 100, 
                number.angles = 30,
                mb.ratio = c(0.5, 0.5), 
                text.scale = 1.5)
Cairo::CairoPDF("../figs/Fig_S1.pdf", width = 10, height = 10)
ps1

dev.off()
quartz_off_screen 
                2 

Figure S2: Network Validation: Gene Overlap

##############################
### Upset Plot of Gene Overlap
##############################


ps2 = NCI_PPI_l %>% 
  select(starts_with("HGNC"),source) %>%
  pivot_longer(-source) %>% 
  select(-name) %>% 
  unique() %>% 
  dplyr::left_join(., n_map, by = c('value' = "Symbol")) %>% 
  mutate(type = ifelse(Category_cl %in% pc, "PC", "NC")) %>% 
  select(-Category_cl) %>%
  group_by(value, type, source) %>% 
  summarise(n = n()) %>% 
  pivot_wider(names_from = source, 
              values_from = n, 
              values_fill = 0) %>%
  as.data.frame() %>%
  UpSetR::upset(., order.by = 'freq', 
                sets.x.label = 'Genes (nodes)', 
                mainbar.y.label = 'Overlap', 
                # sets.bar.color = "#56B4E9",
                # main.bar.color = "#56B4E9",
                queries = list(list(query = elements, 
                                    params = list("type", "PC"), 
                                    color = "#8D3B72", active = T)),
                # sets.bar.color = "#72E1D1",
                main.bar.color = "#72E1D1", 
                nsets = 100, 
                number.angles = 30,
                mb.ratio = c(0.5, 0.5), 
                text.scale = 1.5)


Cairo::CairoPDF("../figs/Fig_S2.pdf", width = 10, height = 10)
ps2

dev.off()
quartz_off_screen 
                2 

Figure S3: Degree distribution of different genomic elements.

##############################
### Degree Distribution 
##############################
gPPI %>%
  degree(.) %>% 
  data.frame(degree = ., Symbol = names(.)) %>%
  left_join(Annot_Dic) %>% 
  group_by(degree, Category_cl) %>% 
  summarise(k = n()) %>% 
  ggplot() +
  aes(x = degree, 
      y = k, 
      fill = Category_cl, 
      colour = Category_cl) +
  geom_point() +
  scale_fill_manual(values = values) +
  scale_color_manual(values = values) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(x = "Degree", y = "k", 
       color = "Gene Category", 
       fill = "Gene Category") +
  theme_minimal() +
  theme(legend.position = "bottom")

gPPInc %>%
  degree(.) %>% 
  data.frame(degree = ., Symbol = names(.)) %>%
  left_join(Annot_Dic) %>% 
  group_by(degree, Category_cl) %>% 
  summarise(k = n()) %>% 
  ggplot() +
  aes(x = degree, 
      y = k, 
      fill = Category_cl, 
      colour = Category_cl) +
  geom_point() +
  scale_fill_manual(values = values) +
  scale_color_manual(values = values) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(x = "Degree", y = "k", 
       color = "Gene Category", 
       fill = "Gene Category") +
  theme_minimal() +
  theme(legend.position = "bottom")

##############################
### DegreeDistribution as bin-log-log
### Start by defining the bins
##############################
DD_loglog = Nodes
DD_loglog %<>% 
  mutate(logdegree = log10(DD_loglog$degree)) 

calc_breaks = function(min_bin, max_bin, bins = 10 ){
  res = 10^(seq(min_bin, max_bin,length.out= bins))
  return(res)
}

bins = DD_loglog %>%
  ungroup() %>%
  group_by(base, Category_cl) %>%
  summarise(min_bin = min(logdegree),
            max_bin = max(logdegree)) %>%
  split(paste(.$base, .$Category_cl, sep = "_")) %>%
  map(~ calc_breaks(.$min_bin, .$max_bin) ) %>% 
  bind_rows() %>%
  mutate( id = 1:n()) %>%
  pivot_longer(data = ., 
               cols = -id, 
               names_sep = "_", 
               names_to = c("base", "Category_cl")) 

genetype = DD_loglog$Category_cl %>% unique()
bases = DD_loglog$base %>% unique()
res = list(); k = 0

for(i in 1:length(genetype)){
  for(j in 1:length(bases)){
    
    bs = bins %>%
      filter(Category_cl == genetype[i]) %>%
      filter(base == bases[j]) %>%
      pull(value)
    
    hi = DD_loglog %>%
      filter(Category_cl == genetype[i]) %>%
      filter(base == bases[j]) %>%
      pull(degree) 
    
    if(length(hi)> 0){
      k = k + 1
      
      hi %<>%
        hist(., breaks = bs, plot = F)
      
      res[[k]] = data.frame(counts = hi$counts, 
                            density = hi$density, 
                            mids = hi$mids, 
                            Category_cl = genetype[i],
                            base = bases[j])
    }
  }
}

res %<>% bind_rows()


res %<>% 
  group_by(Category_cl, base)%>% 
  mutate(tick = 1) %>%
  mutate(tick = cumsum(tick))


Fig_S3 = res %>%
  filter(base %ni% "NCI") %>% 
  ggplot() +
  aes(x = mids, 
      y = density, 
      colour = Category_cl) +
  geom_point(shape = "circle", size = 1.5) +
  geom_smooth(span = 0.75, se = F) +
  scale_color_manual(values = values) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  theme_minimal() +
  facet_wrap(vars(base)) +
  labs (x = "Degree", 
        y = "P < k",
        color = "Element type") + 
  theme(legend.position = "bottom", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5, 
                                   family = "HelveticaNeue"), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5, 
                                        family = "HelveticaNeue"), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue")) 

Cairo::CairoPDF("../figs/Fig_S3.pdf", width = 10, height = 10)
Fig_S3

dev.off()
quartz_off_screen 
                2 

Figure S4: Fit a Power-Law

pk <- function(kcut, kprime, ksat, gamma){
  p_k = 1 / (sum((kprime + ksat)^-gamma * exp (-kprime/kcut))) * (kprime + ksat)^-gamma * exp (-kprime/kcut) 
  return(p_k)
}


PLD = function(g = gPPI, 
               category = "Protein Coding", 
               boot = 1000, 
               col = NULL){
  threads = parallel::detectCores()
  message("We detected ", threads, " threads. All will be used.")
  
  if(is.null(col)){
    col = values[names(values) %in% category] 
  }
  
  out = list()
  
  data.dist = igraph::as_data_frame(g, what = "vertices") %>% 
    filter(Category_cl %in% category) %>% 
    group_by(degree) %>% 
    summarise(f = n()) %>% 
    ungroup() %>%
    mutate(p_k = f/sum(f)) %>% 
    mutate(cump_k = cumsum(p_k)) %>% 
    select(k = degree, p_k, cump_k) %>% 
    mutate(S_k = 1 - cump_k + p_k)
  
  out$data = data.dist
  
  data = igraph::as_data_frame(g, what = "vertices") %>% 
    filter(Category_cl == category)
  
  message("Calculating k_min.")
  m_pl <- displ$new(data$degree)
  est_pl <- estimate_xmin(m_pl)
  
  out$pars$k_min  = est_pl$xmin #k_min
  out$pars$gamma  = est_pl$pars #gamma
  out$pars$gof    = est_pl$gof  #D
  
  data.s <- unique(data$degree)
  
  d_est <- data.frame(K_min=sort(data.s)[1:(length(data.s)-2)], 
                      gamma=rep(0,length(data.s)-2), 
                      D=rep(0,length(data.s)-2))
  
  for (i in d_est$K_min){
    d_est[which(d_est$K_min == i),2] <- 
      estimate_xmin(m_pl, 
                    xmins = i)$pars
    d_est[which(d_est$K_min == i),3] <- 
      estimate_xmin(m_pl, 
                    xmins = i)$gof
  }
  
  K.min_D.min <- d_est[which.min(d_est$D), 1]
  p_k_min = ggplot(data=d_est, 
                   aes(x=K_min, y=D)) +
    geom_line(color = col) +
    geom_vline(xintercept=K.min_D.min, 
               colour="gray20") +
    annotate("text", 
             x=K.min_D.min, 
             y=max(d_est$D)/3*2, 
             label=K.min_D.min) + 
    theme_minimal() +
    theme(text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  p_gamma = ggplot(data=d_est, aes(x=K_min, y=gamma)) +
    geom_line(color = col) +
    geom_vline(xintercept=K.min_D.min, colour="gray20") +
    annotate("text", 
             x=K.min_D.min, 
             y=max(d_est$gamma)/3*2, 
             label=K.min_D.min) + 
    theme_minimal() +
    theme(text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  m_pl$setXmin(est_pl)
  plot.data <- plot(m_pl, draw = F)
  fit.data <- lines(m_pl, draw = F)
  
  message("Goodness-of-fit for k_min.")
  # Goodness-of-fit
  bs_pl <- bootstrap_p(m_pl, 
                       no_of_sims=boot,
                       threads=threads,
                       seed = 123)
  
  df_bs_pl <- bs_pl$bootstraps
  
  p_cdf = ggplot(plot.data) +
    geom_point(aes(x=log10(x), y=log10(y)), 
               color = col, size = 2) +
    labs(x="log (k)", 
         y="log (CDF)", 
         subtitle = paste("Gamma = ", round(est_pl$pars,2),
                          "GOF = ", round(bs_pl$gof,2),
                          "\np = ", round(bs_pl$p,2), 
                          "k_min = ", est_pl$xmin
         )) +
    geom_line(data=fit.data,
              aes(x=log10(x), y=log10(y)),
              colour="gray20") + 
    theme_minimal() +
    theme(text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  p_gamma_boot = ggplot(data=df_bs_pl, aes(pars)) +
    geom_histogram(color = col, fill = col, alpha = 0.5) +
    labs(x="gamma", y="frequency") +
    theme_minimal() + 
    theme(text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  p_kmin_boot = ggplot(data=df_bs_pl, aes(xmin)) +
    geom_histogram(color = col, fill = col, alpha = 0.5) +
    labs(x="K_min", y="frequency") +
    theme_minimal() + 
    theme(text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  data.s <- data.dist$k
  
  message("Defining k_cut and k_sat.")
  #####
  # #generate kmin & kmax pairs
  k_mins = data.s[data.s <= min(est_pl$xmin, 10)]
  k_max = data.s[(length(data.s)-5) : length(data.s)]
  pairs <- expand.grid(k_mins, k_max)
  
  #scan D for all kmin-kmax pairs
  
  find_ks = function(i){
    require(poweRlaw)
    m_pl$setXmin(pairs[i,1])
    est = estimate_xmin(m_pl, 
                        xmin = pairs[i,1],
                        xmax = pairs[i,2],
                        distance = "ks")
    
    out = data.frame(xmin = pairs[i,1],
                     xmax = pairs[i,2],
                     D = est$gof,
                     gamma = est$pars)
  }
  
  k_scan = new.env()
  assign("m_pl", m_pl, envir = k_scan)
  assign("pairs", pairs, envir = k_scan)
  
  require(parallel)
  
  cl = parallel::makePSOCKcluster(threads)
  clusterExport(cl, "pairs", envir = k_scan)
  clusterExport(cl, "m_pl", envir = k_scan)
  pairs = clusterApplyLB(cl, 1:nrow(pairs), find_ks) %>%
    bind_rows()
  
  stopCluster(cl)
  
  
  p_detect_pairs = pairs %>% 
    mutate(xmax_cat = as.factor(xmax)) %>% 
    ggplot() +
    aes(x = xmin, 
        y = D, 
        colour = xmax_cat, 
        shape = xmax_cat) +
    geom_point(size = 1.5) +
    scale_color_hue() +
    theme_minimal() +
    labs(x = "K_sat", y = "D", color = "K_cut", shape = "K_cut") + 
    theme(legend.position = "bottom",
      text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  message("Goodness-of-fit for k_cut and k_sat.")
  bs_pl_sat_cut <- bootstrap_p(m_pl,
                               xmins = pairs[which.min(pairs$D), 1],
                               xmax = pairs[which.min(pairs$D), 2],
                               no_of_sims = boot,
                               threads = threads, 
                               seed = 123)
  
  bs_pl_sat_cut$p -> out$pars_real$pvalue#p-value
  
  pairs[which.min(pairs$D), 1] -> out$pars_real$k_sat
  pairs[which.min(pairs$D), 2] -> out$pars_real$k_cut
  pairs[which.min(pairs$D), 3] -> out$pars_real$D
  pairs[which.min(pairs$D), 4] -> out$pars_real$gamma
  
  
  
  pk_fit = data.frame(pk_fit = pk(ksat = out$pars_real$k_sat, 
                                  kcut = out$pars_real$k_cut, 
                                  gamma = out$pars_real$gamma, 
                                  kprime = data.s), 
                      k = data.s) 
  
  out$pk_fit = dplyr::left_join(out$data, pk_fit) %>%
    mutate(cump_kfit = cumsum(pk_fit)) %>%
    mutate(S_kfit = 1 - cump_kfit + pk_fit) 
  
  p_fit = out$pk_fit %>%
    ggplot() +
    aes(x = k, y = p_k) +
    geom_point(shape = "circle",
               size = 2, alpha = 0.5, color = col) +
    geom_line(aes(x = k, 
                  y=pk_fit), 
              linewidth = 2,
              color = "red") +
    scale_x_continuous(trans = "log10") +
    scale_y_continuous(trans = "log10") +
    labs(x="log10 (k)", 
         y="log10 (p_k)", 
         subtitle = paste("Gamma = ", round(out$pars_real$gamma,2),
                          "GOF = ", round(out$pars_real$D,2),
                          "\np = ", round(out$pars_real$pvalue,2), 
                          "k_sat = ", out$pars_real$k_sat, 
                          "k_cut = ", out$pars_real$k_cut
         )) +
    theme_minimal() +
    theme(legend.position = "none", 
          text = element_text(size = 18),
          axis.title  = element_text(
            face = "bold", 
            hjust = 0.5), 
          axis.title.x.top =element_text(
            face = "bold", 
            hjust = 0.5), 
          strip.text = element_text(
            face = "bold", 
            hjust = 0.5))
  
  
  out$plots = list(p_k_min,
                   p_gamma, 
                   p_cdf, 
                   p_gamma_boot, 
                   p_kmin_boot, 
                   p_detect_pairs, 
                   p_fit)
  
  out$boots = list(p = bs_pl$p, 
                   gof = bs_pl$gof)
  
  out$boots_real = list(p = bs_pl_sat_cut$p, 
                        gof = bs_pl_sat_cut$gof)
  message("\nDone.")
  return(out)
}
B = 1000
## gPPI PC
PLD_PPI_PC  = PLD(g = gPPI, 
                  category = "Protein Coding", 
                  boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 358 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 6.36 seconds.
Joining, by = "k"
Done.
## gPPI TF
PLD_PPI_TF  = PLD(g = gPPI, 
                  category = "TF", 
                  boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 287 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 1.71 seconds.
Joining, by = "k"
Done.
## gPPInc PC
PLD_PPINC_PC  = PLD(g = gPPInc, 
                    category = "Protein Coding", 
                    boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 607 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 5.19 seconds.
Joining, by = "k"
Done.
## gPPInc TF
PLD_PPINC_TF  = PLD(g = gPPInc, 
                    category = "TF", 
                    boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 444 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 3.88 seconds.
Joining, by = "k"
Done.
## gPPInc lncRNA
PLD_PPINC_lncRNA  = PLD(g = gPPInc, 
                        category = "lncRNA", 
                        boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 26.5 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 1.12 seconds.
Joining, by = "k"
Done.
## gPPInc miRNA
PLD_PPINC_miRNA  = PLD(g = gPPInc, 
                       category = "miRNA", 
                       boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 1020 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 1.81 seconds.
Joining, by = "k"
Done.
## gPPInc Pseudogene
PLD_PPINC_pseudo  = PLD(g = gPPInc, 
                        category = "Pseudogene", 
                        boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 38.2 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 1.64 seconds.
Joining, by = "k"
Done.
## gPPInc other
PLD_PPINC_other  = PLD(g = gPPInc, 
                       category = "other", 
                       boot = B)
We detected 10 threads. All will be used.
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 46.7 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 0.945 seconds.
Joining, by = "k"
Done.
Cairo::CairoPDF("../figs/PL_PPI_PC.pdf", width = 15, height = 10)
PLD_PPI_PC$plots[[1]] + PLD_PPI_PC$plots[[2]] + PLD_PPI_PC$plots[[4]] + PLD_PPI_PC$plots[[5]] + PLD_PPI_PC$plots[[6]] + PLD_PPI_PC$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI - Protein Coding")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPI_TF.pdf", width = 15, height = 10)
PLD_PPI_TF$plots[[1]] + PLD_PPI_TF$plots[[2]] +  PLD_PPI_TF$plots[[4]] + PLD_PPI_TF$plots[[5]] + PLD_PPI_TF$plots[[6]] +
  PLD_PPI_TF$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI - TF")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPINC_PC.pdf", width = 15, height = 10)
PLD_PPINC_PC$plots[[1]] + PLD_PPINC_PC$plots[[2]] + PLD_PPINC_PC$plots[[4]] + PLD_PPINC_PC$plots[[5]] + PLD_PPINC_PC$plots[[6]] + PLD_PPINC_PC$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI & NCI - Protein Coding")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPINC_TF.pdf", width = 15, height = 10)
PLD_PPINC_TF$plots[[1]] + PLD_PPINC_TF$plots[[2]] +  PLD_PPINC_TF$plots[[4]] + PLD_PPINC_TF$plots[[5]] + PLD_PPINC_TF$plots[[6]] + PLD_PPINC_TF$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI & NCI - TF")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPINC_lncRNA.pdf", width = 15, height = 10)
PLD_PPINC_lncRNA$plots[[1]] + PLD_PPINC_lncRNA$plots[[2]] + PLD_PPINC_lncRNA$plots[[4]] + PLD_PPINC_lncRNA$plots[[5]] + PLD_PPINC_lncRNA$plots[[6]] + PLD_PPINC_lncRNA$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI & NCI - lncRNA")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPINC_miRNA.pdf", width = 15, height = 10)
PLD_PPINC_miRNA$plots[[1]] + PLD_PPINC_miRNA$plots[[2]] + PLD_PPINC_miRNA$plots[[4]] + PLD_PPINC_miRNA$plots[[5]] + PLD_PPINC_miRNA$plots[[6]] + PLD_PPINC_miRNA$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI & NCI - miRNA")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPINC_pseu.pdf", width = 15, height = 10)
PLD_PPINC_pseudo$plots[[1]] + PLD_PPINC_pseudo$plots[[2]] + PLD_PPINC_pseudo$plots[[4]] + PLD_PPINC_pseudo$plots[[5]] + PLD_PPINC_pseudo$plots[[6]] + 
  PLD_PPINC_pseudo$plots[[7]] + plot_annotation(tag_levels =  "A", title = "PPI & NCI - Pseudogenes")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
Cairo::CairoPDF("../figs/PL_PPINC_other.pdf", width = 15, height = 10)
PLD_PPINC_other$plots[[1]] + PLD_PPINC_other$plots[[2]] + PLD_PPINC_other$plots[[4]] + PLD_PPINC_other$plots[[5]] + PLD_PPINC_other$plots[[6]] +
  PLD_PPINC_other$plots[[7]] +plot_annotation(tag_levels =  "A", title = "PPI & NCI - Other")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dev.off()
quartz_off_screen 
                2 
PLD_PPI = PLD(g = gPPI, 
              category = unique(V(gPPI)$Category_cl), 
              col = "#8D3B72")
We detected 10 threads. All will be used.
Warning in Category_cl == category: longer object length is not a multiple of
shorter object length
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 351 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 3.64 seconds.
Joining, by = "k"
Done.
PLD_NCPPI = PLD(g = gPPInc, 
                category = unique(V(gPPInc)$Category_cl), 
                col = "#72E1D1")
We detected 10 threads. All will be used.
Warning in Category_cl == category: longer object length is not a multiple of
shorter object length
Calculating k_min.
Goodness-of-fit for k_min.
Expected total run time for 1000 sims, using 10 threads is 313 seconds.
Defining k_cut and k_sat.
Goodness-of-fit for k_cut and k_sat.
Some of your data is larger than xmax. The xmax parameter is
            the upper bound of the xmin search space. You could try increasing
            it. If the estimated values are below xmax, it's probably OK not to
            worry about this.
Expected total run time for 1000 sims, using 10 threads is 2.15 seconds.
Joining, by = "k"
Done.
fits = list()

fits[[1]] = PLD_PPI_PC$pk_fit %>% 
  mutate(network = "PPI", type = "Protein Coding")
fits[[1]] %<>%  
  bind_cols(., PLD_PPI_PC$pars_real %>% 
  as.data.frame())

fits[[2]] = PLD_PPINC_PC$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "Protein Coding")
fits[[2]] %<>%  
  bind_cols(., PLD_PPINC_PC$pars_real %>% 
  as.data.frame())

fits[[3]] = PLD_PPI_TF$pk_fit %>% 
  mutate(network = "PPI", type = "TF")
fits[[3]] %<>%  
  bind_cols(., PLD_PPI_TF$pars_real %>% 
  as.data.frame())

fits[[4]] = PLD_PPINC_TF$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "TF")
fits[[4]] %<>%  
  bind_cols(., PLD_PPINC_TF$pars_real %>% 
  as.data.frame())

fits[[5]] = PLD_PPINC_lncRNA$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "lncRNA")
fits[[5]] %<>%  
  bind_cols(., PLD_PPINC_lncRNA$pars_real %>% 
  as.data.frame())

fits[[6]] = PLD_PPINC_miRNA$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "miRNA")
fits[[6]] %<>%  
  bind_cols(., PLD_PPINC_miRNA$pars_real %>% 
  as.data.frame())

fits[[7]] = PLD_PPINC_other$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "other")
fits[[7]] %<>%  
  bind_cols(., PLD_PPINC_other$pars_real %>% 
  as.data.frame())

fits[[8]] = PLD_PPINC_pseudo$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "Pseudogene")
fits[[8]] %<>%  
  bind_cols(., PLD_PPINC_pseudo$pars_real %>% 
  as.data.frame())


fits[[9]] = PLD_NCPPI$pk_fit %>% 
  mutate(network = "PPI & NCI", type = "PPI & NCI")
fits[[9]] %<>%  
  bind_cols(., PLD_NCPPI$pars_real %>% 
  as.data.frame())

fits[[10]] = PLD_PPI$pk_fit %>% 
  mutate(network = "PPI", type = "PPI")
fits[[10]] %<>%  
  bind_cols(., PLD_PPI$pars_real %>% 
  as.data.frame())

fits %<>% bind_rows()
p_sk = fits %>%
  ggplot() +
  aes(x = k, y = S_k, colour = type) +
  geom_point(pch = 19, size = 1.5, alpha = 0.2) +
  geom_line(linewidth = 2, aes(x = k, y = S_kfit), col = "red") +
  scale_color_manual(values = c(PPI_colors, values)) +
  scale_x_continuous(trans = "log10", , labels = scales::label_log()) +
  scale_y_continuous(trans = "log10", limits = c(min(fits$S_k), 1), labels = scales::label_log()) +
  theme_minimal() +
  facet_wrap(vars(network, type))+
  labs(x="log10 (k) ", y="log10(1 - CDF)") +
  theme(legend.position = "bottom", 
        text = element_text(size = 18),
        axis.title  = element_text(
          face = "bold", 
          hjust = 0.5), 
        axis.title.x.top =element_text(
          face = "bold", 
          hjust = 0.5), 
        strip.text = element_text(
          face = "bold", 
          hjust = 0.5))
Cairo::CairoPDF("../figs/FigS4.pdf", width = 15, height = 10)
p_sk
Warning: Removed 4 rows containing missing values (`geom_line()`).

dev.off()
quartz_off_screen 
                2 
## Get pars
## 
fits_ps = list()

fits_ps[[1]] = PLD_PPI_PC$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI", type = "Protein Coding")

fits_ps[[2]] = PLD_PPINC_PC$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "Protein Coding")

fits_ps[[3]] = PLD_PPI_TF$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI", type = "TF")

fits_ps[[4]] = PLD_PPINC_TF$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "TF")

fits_ps[[5]] = PLD_PPINC_lncRNA$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "lncRNA")

fits_ps[[6]] = PLD_PPINC_miRNA$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "miRNA")

fits_ps[[7]] = PLD_PPINC_other$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "other")

fits_ps[[8]] = PLD_PPINC_pseudo$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "Pseudogene")


fits_ps[[9]] = PLD_NCPPI$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI & NCI", type = "PPI & NCI")

fits_ps[[10]] = PLD_PPI$pars_real %>% as.data.frame() %>% 
  mutate(network = "PPI", type = "PPI")


fits_ps %<>% bind_rows()

Table S6: Power Law Gamma

fits_ps %>% 
  select(network, type, gamma, pvalue) %>% 
  knitr::kable()
network type gamma pvalue
PPI Protein Coding 1.668377 0.552
PPI & NCI Protein Coding 1.536021 0.546
PPI TF 1.544926 0.530
PPI & NCI TF 1.443738 0.511
PPI & NCI lncRNA 1.983666 0.511
PPI & NCI miRNA 1.423850 0.641
PPI & NCI other 1.938215 0.451
PPI & NCI Pseudogene 1.764790 0.000
PPI & NCI PPI & NCI 1.525346 0.582
PPI PPI 1.639540 0.497

Figure S4: Gene completion mapped in the PPI and the combined network.

##############################
### Understand the percentage of genes
### that are annotated and also can be found
### in the networks. 
##############################

Classificate_Summary = gPPInc %>% 
  igraph::as_data_frame(., what = "vertices") %>%
  mutate(Category_Summarised = ifelse(Category_cl %in% 
                                        c("TF", "Protein Coding"), "Protein Coding", as.character(Category_cl))) %>%
  mutate(Category_Summarised = ifelse(Category_Summarised %in% 
                                        c("lncRNA", "miRNA", "ncRNA"), "Non Coding", as.character(Category_Summarised)))

Classificate_Summary %>% 
  group_by(Category_Summarised) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(`%` = round(n/sum(n) * 100, 2))
# A tibble: 4 × 3
  Category_Summarised     n   `%`
  <chr>               <int> <dbl>
1 Non Coding           3784 14.2 
2 other                1004  3.78
3 Protein Coding      18358 69.1 
4 Pseudogene           3429 12.9 
dist_genesncppi = gPPInc %>% 
  igraph::as_data_frame(., "vertices") %>%
  ungroup() %>% 
  select(Symbol = name)  %>%
  unique() %>% 
  mutate(NCIPPI = "Yes") %>% 
  dplyr::full_join(., Annot_Dic) %>%
  mutate(NCIPPI = ifelse(is.na(NCIPPI), "No", NCIPPI)) %>%
  group_by(Category_cl, NCIPPI) %>%
  summarise(n = n())  %>%
  arrange(n)

Table_dist_genesppi = dist_genesncppi %>%
  pivot_wider(names_from = NCIPPI, values_from = n) %>%
  mutate(`% retrieved` = Yes/(Yes + No) * 100) %>%
  mutate(total = (Yes + No)) %>%
  arrange(total)

Table_dist_genesppi$Category_cl %<>% factor(., levels = Table_dist_genesppi$Category_cl)


Table2 = Annot_Dic %>%
  select(-color)%>% 
  mutate(`PPI` = ifelse(Symbol %in% V(gPPI)$name, "Yes", "No")) %>%
  mutate(`NCI & PPI` = ifelse(Symbol %in% V(gPPInc)$name, "Yes", "No")) %>%
  select(-Symbol) %>% 
  pivot_longer(-Category_cl) %>%
  group_by(Category_cl, name, value) %>%
  summarise(n = n()) %>%
  arrange(n)

l = Table2 %>% 
  filter(name == "NCI & PPI") %>%
  group_by(Category_cl) %>%
  summarise(Total = sum(n)) %>%
  arrange(Total) 

Table2$Category_cl %<>% factor(., levels = l$Category_cl)
Table2 %<>% dplyr::inner_join(., l) %>%
  mutate(pct = round(n/Total * 100,1)) %>%
  mutate(pct_lab = ifelse(value == "No", NA, pct))

lev = Table2 %>% 
  ungroup %>% 
  select(Category_cl, Total) %>% 
  unique() %>%
  arrange(Total)

Table2$Category_cl %<>% factor(., levels = lev$Category_cl) 
Fig_s4 = Table2 %>%
  mutate(name = factor(name, 
                       levels = c("PPI","NCI & PPI"), 
                       labels = c("PPI","PPI & NCI"))) %>%
  mutate(pct_lab = ifelse(!is.na(pct_lab), paste(pct_lab, "%"), "")) %>% 
  ggplot() +
  aes(x = Category_cl, 
      fill = value, 
      weight = n) +
  geom_bar() +
  geom_text(aes(label = pct_lab), 
            stat = "count", 
            hjust = 1,
            colour = "#a4133c") +
  scale_fill_manual(
    values = c(No = "#ffccd5",
               Yes = "#ff4d6d")
  ) +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "bottom", 
        text = element_text(size = 18),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5)) +
  facet_wrap(vars(name)) +
  labs(y = "# Genes", 
       x = "Gene Category", 
       fill = "In the data base")
Cairo::CairoPDF(paste0("../figs/Fig_S4.pdf"), 
                height = 5, 
                width = 15)

Fig_s4

dev.off()
quartz_off_screen 
                2 

Figure 1: The role of ncRNA in gene regulation and connection to the human interactome.

##############################
### Get Genes Category in the neworks to understand the
### pattern of gene-gene regulation
### build a network to viz results
##############################

gPPInc %<>% 
  simplify(., remove.multiple = F, remove.loops = TRUE)
gPPI %<>% 
  simplify(., remove.multiple = F, remove.loops = TRUE)

edges = gPPInc %>% 
  as_data_frame(., what = "edges") %>%
  inner_join(Annot_Dic, by = c(from = "Symbol")) %>%
  rename(from.type = Category_cl) %>% 
  inner_join(Annot_Dic, by = c(to = "Symbol")) %>%
  rename(to.type = Category_cl) %>% 
  select(from.type, to.type) %>%
  OrderNames() %>% 
  group_by(from.type, to.type) %>%
  summarise(n = n()) %>%
  ungroup()

nodes = gPPInc %>% 
  simplify(., remove.multiple = F, remove.loops = TRUE) %>%
  as_data_frame(., what = "vertices") %>%
  group_by(Category_cl) %>%
  summarise(n = n()) %>%
  ungroup() 

########################### 
#### Export Networks
###########################
save(gPPI, gPPInc, GDA, Annot_Dic, file = "../data/output/graphs.RData")

fig1 = graph_from_data_frame(edges, directed = F, vertices = nodes)
V(fig1)$size = (V(fig1)$n/max(V(fig1)$n) + 0.1) * 50
E(fig1)$width = (E(fig1)$n/max(E(fig1)$n) + 0.1) * 10

Cairo::CairoPDF(paste("../figs/Fig_01.pdf"), width = 12, height = 12)
par(mar=c(0,0,0,0))
plot(fig1)

dev.off()
quartz_off_screen 
                2 

Text Results: We find that 250 of the 861 diseases are enriched for ncRNAs.

##############################
### Perform Proportion's test and 
### multiple correction
##############################
Totals = GDA %>% 
  mutate(category = ifelse(Category_cl %in% c("Protein Coding", "TF"), "PC", as.character(Category_cl))) %>%
  mutate(category = ifelse(category %in% c('lncRNA',
                                           'miRNA',
                                           'ncRNA'),
                           "NC",
                           category)) %>%
  filter(category %in% c("PC", "NC")) %>% 
  select(category, hgnc_symbol) %>%
  unique() %>%
  group_by(category) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = category, 
              values_from = n)

GDA2Prop = GDA %>% 
  mutate(category = ifelse(Category_cl %in% c("Protein Coding", "TF"), "PC", as.character(Category_cl))) %>%
  mutate(category = ifelse(category %in% c('lncRNA',
                                           'miRNA',
                                           'ncRNA'),
                           "NC",
                           category)) %>%
  filter(category %in% c("PC", "NC")) %>% 
  select(DiseaseName, category, hgnc_symbol) %>%
  unique() %>%
  group_by(DiseaseName, category) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = category, 
              values_from = n, 
              values_fill = 0)  %>% 
  arrange(desc(NC)) %>%
  mutate(Total_PC = Totals$PC, 
         Total_NC = Totals$NC)

GDA2Prop[is.na(GDA2Prop)]<- 0
fishers.test = function(base = GDA2Prop){
  tmp = list()
  for(i in 1:nrow(base)){
    mat = matrix(c(base$PC[i], 
                   base$NC[i],  (base$Total_PC[i] - base$PC[i]), 
                   
                   (base$Total_NC[i] - base$NC[i])), 
                 byrow = TRUE, 
                 ncol = 2)
    p = fisher.test(mat, alternative = "less")$p.value
    
    tmp[[i]] = data.frame(disease= base$DiseaseName[i], p = p)
  }
  tmp = bind_rows(tmp)
  return(tmp)
}

props.test = function(base = GDA2Prop){
  tmp = list()
  for(i in 1:nrow(base)){
    p = suppressWarnings(prop.test(c(base$NC[i], base$PC[i]),
                                   c(base$Total_NC[i], base$Total_PC[i]), 
                                   alternative = "greater", 
                                   correct = TRUE))$p.value
    
    tmp[[i]] = data.frame(disease= base$DiseaseName[i], p = p)
  }
  tmp = bind_rows(tmp)
  return(tmp)
}

enrich = props.test()
enrich %<>%
  mutate(padj = p.adjust(p, method = "fdr"))
enrich %>% 
  filter(padj < 0.05) %>% 
  nrow()
[1] 208

Figure 2: Disease modules for Rheumatoid Arthritis (RA), Crohn’s Disease (CD), and Pre-Eclampsia (PE).

##############################
### Build the viz of the disease modules
##############################
leg = data.frame(color = V(gPPInc)$color, 
                 type = V(gPPInc)$Category_cl) %>% 
  unique()
RAPD = plot_diseases("arthritis rheumatoid")
CDPD = plot_diseases("crohn disease")
PEPD = plot_diseases("pre eclampsia")
Cairo::CairoPDF(paste("../figs/Fig_02.pdf"), width = 12, height = 12)
graphics::layout(mat = matrix(c(1,2,3,
                                4,5,6, 
                                7,7,7), ncol = 3, byrow = T), heights = c(3,3,1))
par(mar = c(0,0,2,0))

g = RAPD$PPI 
l = layout_with_fr(g, 
                   weights = E(g)$weight^1/20)
l <- norm_coords(l, ymin=-1, 
                 ymax=1, 
                 xmin=-1, 
                 xmax=1)
g %>%
  plot(., 
       vertex.size = ifelse(degree(.) == 0, 0.5, (V(.)$size +0.2) *5),
       vertex.label.cex = (V(.)$size +0.2)*1,
       edge.curved = 0.1,
       edge.width = E(.)$width*2 ,
       edge.weight = E(.)$weight^100, 
       layout = l)
title("Rheumatoid Arthritis - PPI")

g = CDPD$PPI 
l = layout_with_fr(g, 
                   weights = E(g)$weight^1/20)
l <- norm_coords(l, ymin=-1, 
                 ymax=1, 
                 xmin=-1, 
                 xmax=1)

g %>%
  plot(., vertex.size = ifelse(degree(.) == 0, 0.5, (V(.)$size +0.2) *5),
       vertex.label.cex = (V(.)$size +0.2)*1,
       # vertex.label = NA,
       edge.curved = 0.1,
       edge.width = E(.)$width*2 ,
       edge.weight = E(.)$weight^100, 
       layout = l)
title("Crohn Disease - PPI")


g = PEPD$PPI 
l = layout_with_fr(g, 
                   weights = E(g)$weight^1/20)
l <- norm_coords(l, ymin=-1, 
                 ymax=1, 
                 xmin=-1, 
                 xmax=1)

g %>%
  plot(., vertex.size = ifelse(degree(.) == 0, 0.5, (V(.)$size +0.2) *5),
       vertex.size = (V(.)$size +0.2) *5,
       vertex.label.cex = (V(.)$size +0.2)*1,
       # vertex.label = NA,
       edge.curved = 0.1,
       edge.width = E(.)$width*2 ,
       edge.weight = E(.)$weight^100)
title("Pre Eclampsia - PPI")


g = RAPD$NCIPPI %>% 
  delete.edges(E(.)[E(.)$width^1.65 < 0.01]) %>%
  delete.edges(E(.)[E(.)$width^1/2 < 0.03]) 
l = layout_with_fr(g, 
                   weights = E(g)$weight^1/20)
l <- norm_coords(l, ymin=-1, 
                 ymax=1, 
                 xmin=-1, 
                 xmax=1)

g %>%
  plot(., vertex.size = ifelse(degree(.) == 0, 0.5, (V(.)$size +0.2) *5),
       vertex.label.cex = (V(.)$size +0.2)*1,
       # vertex.label = NA,
       edge.curved = 0.1,
       edge.width = (E(.)$width^2),
       edge.weight = E(.)$weight^1/20,
       layout = l)
title("Rheumatoid Arthritis - PPI & NCI")

g = CDPD$NCIPPI %>% 
  delete.edges(E(.)[E(.)$width^1.65 < 0.01]) %>%
  delete.edges(E(.)[E(.)$width^1/2 < 0.03]) 
l = layout_with_fr(g, 
                   weights = E(g)$weight^1/20)
l <- norm_coords(l, ymin=-1, 
                 ymax=1, 
                 xmin=-1, 
                 xmax=1)

g %>%
  plot(., vertex.size = ifelse(degree(.) == 0, 0.5, (V(.)$size +0.2) *5),
       vertex.label.cex = (V(.)$size +0.2)*1,
       # vertex.label = NA,
       edge.curved = 0.1,
       edge.width = (E(.)$width^2),
       edge.weight = E(.)$weight^1/20,
       layout = l)
title("Crohn Disease PPI & NCI")

g = PEPD$NCIPPI %>% 
  delete.edges(E(.)[E(.)$width^1.65 < 0.01]) %>%
  delete.edges(E(.)[E(.)$width^1/2 < 0.03]) 
l = layout_with_fr(g, 
                   weights = E(g)$weight^1/2)
# l <- norm_coords(l, ymin=-1, 
#                  ymax=1, 
#                  xmin=-1, 
#                  xmax=1)

g %>%
  plot(., vertex.size = ifelse(degree(.) == 0, 0.5, (V(.)$size +0.2) *5),
       vertex.label.cex = (V(.)$size +0.2)*1,
       # vertex.label = NA,
       edge.curved = 0.1,
       edge.width = (E(.)$width^2),
       edge.weight = E(.)$weight^1/20,
       layout = l)
title("Pre Eclampsia PPI & NCI")

plot(1, 1, xlim = c(20,30), 
     axes = F, 
     xlab = "",
     ylab = "")
legend("bottom",
       cex = 1.5,
       # xpd = TRUE,
       legend = leg$type, 
       col = leg$color, 
       pch = 19,
       bty = "n", 
       ncol = length(leg$color)
)

dev.off()
quartz_off_screen 
                2 

GDA databases

GDA_tmp = GDA %>% 
  select(DiseaseName, hgnc_symbol, evidence) 

GDA_tmp = GDA_tmp$evidence %>% 
  stringr::str_split(., "\\|", simplify = TRUE) %>%
  as.data.frame() %>% 
  cbind(GDA_tmp, . ) %>% 
  select(-evidence) %>% 
  pivot_longer(., -c(DiseaseName, hgnc_symbol), values_to = "database" ) %>% 
  filter(database != "") %>% 
  select(-name)

Figure S6: Disease Associated genes and their classification.

Figure S6A: Rainplot of Number of disease-associated genes classified by gene category

##############################
### Rainplot of Number of disease-associated 
### genes classified by gene category.
##############################

GDA_CAT = GDA %>% 
  select(DiseaseName, Category_cl, hgnc_symbol) %>%
  unique() %>%
  group_by(DiseaseName, Category_cl) %>%
  summarise(n = n()) %>%
  arrange(desc(n))



GDA_CAT$Category_cl %<>% factor(., 
                                levels = c("Protein Coding",
                                           "TF",
                                           "Pseudogene",
                                           "lncRNA",
                                           "miRNA",
                                           "other"))



fig_s6A = GDA_CAT %>% 
  filter(!(Category_cl %in% "other")) %>%
  ggplot() +
  aes(x = Category_cl, 
      y = n,
      fill = Category_cl, 
      color = Category_cl) +
  geom_half_violin(adjust = 1.5L,
                   scale = "width",
                   alpha = 0.5,
                   side = "r",
                   position = position_nudge(x = 0.35, y = 0)) +
  geom_point(
    position = position_jitterdodge(),
    shape = "circle",
    size = .25,
    alpha = 1) +
  geom_boxplot(shape = "circle", 
               alpha = 0.1, 
               width = 0.25, 
               outlier.shape = NA) +
  # expand_limits(x = 5.25) +
  scale_color_manual(values = values) +
  scale_fill_manual(values = values) +
  scale_y_continuous(trans = "log10") + 
  coord_flip() + 
  theme_minimal() +
  labs(x = "Gene Class",
       y = "# of disease associated genes") + 
  theme (legend.position = "none", 
         axis.title  = element_text(face = "bold", hjust = 0.5), 
         axis.title.x.top =element_text(face = "bold", hjust = 0.5), 
         strip.text = element_text(face = "bold", hjust = 0.5)); fig_s6A

Figure S6B: Degree distribution of gene-disease associations.

##############################
### Degree distribution of gene-disease associations.
##############################
DisDeg = GDA %>% 
  group_by(hgnc_symbol, Category_cl) %>%
  summarise(degree = n()) %>%
  ungroup() %>%
  group_by(Category_cl, degree) %>%
  summarise(n = n())

fig_s6B = DisDeg %>%
    filter(!(Category_cl %in% "other")) %>%
    ggplot() +
    aes(
        x = degree,
        y = n,
        fill = Category_cl,
        colour = Category_cl
    ) +
    geom_point(shape = "circle", size = 1.5, alpha = .5) +
    geom_smooth(span = 1, se= F, method = "gam") +
    scale_color_manual(values = values) +
    scale_fill_manual(values = 
                          values) +
    scale_x_continuous(trans = "log10") +
    scale_y_continuous(trans = "log10", limits  = c(1, 3000)) +
    theme_minimal() +
    theme(legend.position = "bottom") + 
    labs(x = "Degree", y = "#", fill = "Gene Category", color = "Gene Category")+
    theme (legend.position = "none", 
           axis.title  = element_text(face = "bold", hjust = 0.5), 
           axis.title.x.top =element_text(face = "bold", hjust = 0.5), 
           strip.text = element_text(face = "bold", hjust = 0.5)); fig_s6B

Figure S6C: The % of genes in each category found in the LCC.

##############################
### LCC is generated in 01_LCC_Sab.R
### The % of genes in each category found in the LCC.
##############################

Out = fread("../data/output/LCC.csv") 

Out %<>% filter(disease %in% GDA$DiseaseName)
Out %<>%
  group_by(graph) %>%
  mutate(padj = p.adjust(p, method = "fdr")) %>%
  ungroup() %>%
  mutate(graph = ifelse(graph == "gPPInc", "PPI & NCI", graph)) %>% 
  mutate(graph = ifelse(graph == "gPPI", "PPI", graph))



PPI_diseases = Out %>%
  arrange(disease) %>% 
  pull(disease) %>% unique()

pb <- progress_bar$new(
  format = "  Computed [:bar] :percent eta: :eta",
  total = length(PPI_diseases), 
  clear = FALSE,
  width= 60)

LCCs_Distribution_Genes = list()
for (i in 1:length(PPI_diseases)){
  pb$tick()
  tmp_g = GDA %>%
    filter(DiseaseName %in% PPI_diseases[i]) %>%
    select(hgnc_symbol, Category_cl)
  
  tmp_ppi = gPPI %>%
    induced.subgraph(., tmp_g$hgnc_symbol[tmp_g$hgnc_symbol %in% V(.)$name]) %>%
    extract_LCC() %>%
    as_data_frame("vertices") %>%
    mutate(disease = PPI_diseases[i]) %>%
    mutate(graph = "PPI")
  
  tmp_ncppi = gPPInc %>%
    induced.subgraph(., tmp_g$hgnc_symbol[tmp_g$hgnc_symbol %in% V(.)$name]) %>%
    extract_LCC() %>%
    as_data_frame("vertices") %>%
    mutate(disease = PPI_diseases[i]) %>%
    mutate(graph = "PPI & NCI")
  
  LCCs_Distribution_Genes[[i]] = bind_rows(tmp_ppi, tmp_ncppi)
}
LCCs_Distribution_Genes %<>% bind_rows()

LCCs_Distribution_Genes_summary = LCCs_Distribution_Genes %>%
  group_by(Category_cl, disease, graph) %>%
  summarise(in_LCC = n()) %>%
  rename(DiseaseName = disease) %>%
  pivot_wider(names_from = graph, 
              values_from =  in_LCC, 
              values_fill = 0, 
              names_prefix = "in_LCC_")


LCCs_Distribution_Genes_summary %<>%
  dplyr::left_join(., GDA_CAT) %>%
  mutate(prop_LCC_PPINCI = (`in_LCC_PPI & NCI`/n)*100, 
         prop_LCC_PPI = (`in_LCC_PPI`/n)*100)


LCCs_Distribution_Genes_summary_l = LCCs_Distribution_Genes_summary %>% 
  select(Category_cl, DiseaseName, starts_with("prop")) %>%
  pivot_longer(-c(1:2))
LCCs_Distribution_Genes_summary_l$Category_cl %<>% factor(., 
                                                          levels = c("Protein Coding",
                                                                     "TF",
                                                                     "Pseudogene",
                                                                     "lncRNA",
                                                                     "miRNA",
                                                                     "other"))


fig_s6C = LCCs_Distribution_Genes_summary_l %>%
  mutate(graph = ifelse(name == "prop_LCC_PPINCI", "PPI & NCI", "PPI")) %>% 
  ggplot() +
  aes(x = value,
      y = Category_cl, 
      fill = graph, 
      color = graph) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_manual(values = PPI_colors) +
  scale_color_manual(values = PPI_colors) +
  labs(
    x = "% of genes in the LCC",
    y = "Gene Category",
    fill = "Network", 
    colour = "Network"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.title  = element_text(
          face = "bold", 
          hjust = 0.5), 
        axis.title.x.top =element_text(
          face = "bold", 
          hjust = 0.5), 
        strip.text = element_text(
          face = "bold", 
          hjust = 0.5))
Cairo::CairoPDF("../figs/Fig_S6.pdf", width = 15, height = 6)
fig_s6A + fig_s6B + fig_s6C + 
  plot_annotation(tag_levels = 'A')

dev.off()
quartz_off_screen 
                2 

Figure 3: Uncovering Gene-Disease Associations.

Disease Modules are computed in 01_LCC_Sab.R.

Figure 3A: Euler Plot

##############################
### Overlap of significant diseases in the PPI and the PPI & NCI
### Euler plot: Similar to the usual Venn Diagram, 
### but with proportional areas
##############################

Euler = list()
Euler$PPI = Out %>%
  filter(padj < lccp) %>%
  filter(graph == "PPI") %>% 
  pull(disease) 
Euler$`PPINCI` = Out %>%
  filter(padj < lccp) %>%
  filter(graph == "PPI & NCI") %>% 
  pull(disease) 

require(ggforce)
teste_eu = eulerr::euler(Euler)
fig_3A = teste_eu$ellipses %>%
  mutate(r = a/2) %>%
  mutate(x = h) %>%
  mutate(y = k) %>% 
  mutate(total = c(35, 132)) %>% 
  mutate(labels = c("PPI", "PPI & NCI")) %>% 
  
  ggplot() +
  geom_circle(aes(x0 = x, y0 = y, r = r, 
                  fill=labels, 
                  color = labels), 
              alpha = .5) +
  geom_text(aes(x = x, y = y, label = labels)) +
  geom_text(aes(x = x, y = y - 0.5, label = total)) +
  scale_fill_manual(values = PPI_colors) + 
  scale_color_manual(values = PPI_colors) + 
  coord_fixed() +
  theme_void() + 
  theme(legend.position = "none", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue"))

fig_3A

Figure 3B: Dumbell Plot

##############################
### Dumbell plot for rLCC size in the PPI 
### and in the PPI & NCI
##############################

PPI_diseases = c("insulin resistance",
                 "astrocytoma",
                 "thymoma")

NCIPPI_diseases = c('alopecia', 'anxiety disorders', 
                    'autoimmune diseases', 'carcinoma endometrioid', 
                    'celiac disease', 
                    'cleft lip', 'dementia',
                    'diabetes gestational', 'glaucoma',
                    'myocarditis', 'polycystic ovary syndrome', 
                    'pre eclampsia', "testicular neoplasms",
                    "pancreatitis" , "osteochondrodysplasias" ,
                    "neutropenia" , "retinitis pigmentosa")

both_diseases = c('thrombosis', 'stroke', 
                  'psoriasis', 'parkinson disease', 
                  'osteoarthritis', 'neuroblastoma', 
                  'myocardial infarction', 'mood disorders',
                  'melanoma', 'inflammatory bowel diseases', 
                  'glioma', 'endometriosis', 'bipolar disorder')

Dis_class = GDA %>% 
  select(DiseaseName, DiseaseClass = DescriptorName) %>%
  unique() %>%
  rename(disease = DiseaseName)

Dis_class_dic = Dis_class$DiseaseClass %>%
  stringr::str_split("\\|", simplify = T) %>% 
  as.data.frame() %>%
  mutate(DiseaseName = Dis_class$disease) %>%
  pivot_longer(., - DiseaseName) %>%
  filter(value != "") %>%
  select( - name) %>%
  group_by(value) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(DiseaseName) %>%
  mutate(max = max(n)) %>%
  filter(n == max) %>%
  mutate(max_dis = max(value)) %>%
  filter(max_dis == value) %>%
  unique() %>%
  select(1:2) %>%
  group_by(value) %>%
  mutate(n = n()) %>%
  mutate(DiseaseClass = 
           ifelse(n < 20, "other", value)) %>% 
  ungroup()


prepare_dumbell = Out %>% 
  dplyr::inner_join(Dis_class) %>% 
  mutate(rLCC = LCC/Targets * 100) %>% 
  select(disease, graph, rLCC, DiseaseClass, padj) %>%
  mutate(label = ifelse(disease %in% Interest, disease, NA)) %>%
  mutate(size = ifelse(disease %in% Interest, 0.5, 2)) %>%
  mutate(alpha = ifelse(disease %in% Interest, 0.2, 1))

levels_dis = prepare_dumbell %>%
  filter(disease %in% c(Interest, PPI_diseases, 
                        NCIPPI_diseases, both_diseases)) %>% 
  group_by(disease) %>%
  summarise(maxrLCC = max(rLCC)) %>%
  arrange(maxrLCC) %>% 
  pull(disease) %>% 
  unique()

fig_3B = prepare_dumbell %>%
  filter(disease %in% c(Interest, PPI_diseases, 
                        NCIPPI_diseases, both_diseases)) %>% 
  mutate(disease = factor(disease, levels = levels_dis)) %>% 
  mutate(sign = ifelse(padj < lccp, "padj < 0.05", "padj > 0.05")) %>% 
  ggplot(.) +
  aes(x = rLCC,
      y = disease, 
      color = graph) +  
  geom_vline(xintercept = 50, color = "#d62828", alpha = 0.3) + 
  geom_line(aes(group = disease), linewidth = 1, color = "#a1a5a4") +
  geom_point(shape = 19, size = 5, color = "white") + 
  geom_point(aes(shape = sign), size = 5) + 
  scale_color_manual(values = PPI_colors)+ 
  scale_x_continuous(limits = c(0, 100)) + 
  scale_shape_manual(values = c(19, 1))+
  theme_minimal()+
  theme(legend.position = "bottom") + 
  labs(x = "rLCC (%)", y = "Disease", shape = element_blank(), color = element_blank())+
  theme(legend.position = "none", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5, 
                                   family = "HelveticaNeue"), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5, 
                                        family = "HelveticaNeue"), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue"))
fig_3B

Figure 3C: Histogram

##############################
### Histogram of the LCCs in the PPI and in the PPI & NCI
##############################

fig_3C = Out %>%
  mutate(label = ifelse(disease %in% Interest, disease, NA)) %>% 
  mutate(graph = factor(graph,
                        levels = c("PPI",
                                   "PPI & NCI"))) %>%
  filter(padj < lccp) %>%
  filter(graph != "NCI") %>% 
  ggplot() +
  aes(x = LCC,
      fill = graph, 
      colour = graph,
      size = padj,
      label = label) +
  geom_histogram(bins = 100, alpha = 0.3) +
  geom_text_repel( y = 0, 
                   min.segment.length = 0, 
                   seed = 777, 
                   size = 5,
                   nudge_x = 2,
                   box.padding = 1,
                   nudge_y = 2,
                   segment.curvature = -0.1,
                   segment.ncp = 30,
                   segment.angle = 20) + 
  scale_fill_manual(
    values = PPI_colors
  ) +
  scale_color_manual(
    values = PPI_colors
  ) +
  scale_x_continuous(trans = "log10", 
                     labels = scales::label_number())+
  theme_minimal() +
  labs(y = "Frequency", 
       x = "LCC size") +
  facet_wrap(vars(graph), scales = "free_y", ncol = 1L)+
  theme(legend.position = "none", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5, 
                                   family = "HelveticaNeue"), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5, 
                                        family = "HelveticaNeue"), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue"));
fig_3C

Figure 3D: Scatterplot pvalue

##############################
### Scatterplot of the p-values 
### in the PPI and the PPI & NCI
##############################

colors = paste0("#",c("ef476f","f36a6d","f78c6b",
                      "fbaf69","fdc068","ffd166",
                      "88ae8c","4d9c9f",
                      "118ab2","0f96ae",
                      "0ca1a9","0ab594",
                      "06d6a0","a1a5a4"))

names(colors) = c(
  "Cardiovascular Diseases",
  "Congenital, Hereditary, and Neonatal Diseases and Abnormalities",
  "Digestive System Diseases" ,
  "Female Urogenital Diseases and Pregnancy Complications",
  "Hemic and Lymphatic Diseases",
  "Mental Disorders" ,
  "Musculoskeletal Diseases"  ,
  "Neoplasms"  ,
  "Nervous System Diseases",
  "Nutritional and Metabolic Diseases"  ,
  "Pathological Conditions, Signs and Symptoms"   ,
  "Respiratory Tract Diseases"   ,
  "Skin and Connective Tissue Diseases",
  "other"
)

colors_original = colors

labels_colors = c(
  "Cardiovascular",
  "Congenital",
  "Digestive" ,
  "Female",
  "Lymphatic",
  "Mental Disorders" ,
  "Musculoskeletal"  ,
  "Neoplasms"  ,
  "Nervous System",
  "Metabolic"  ,
  "Pathologies"   ,
  "Respiratory"   ,
  "Skin Tissue",
  "Other"
)

Out2 = Out %>%
  select(Targets,
         disease,
         graph,
         LCC,
         p,
         padj) %>%
  pivot_wider(names_from = c(graph),
              values_from = c(p, padj, LCC, Targets)) %>%
  na.exclude() %>%
  janitor::clean_names()



Out2 %<>% left_join(., Dis_class_dic %>% 
                      select(disease = DiseaseName, 
                             DiseaseClass = value)) %>%
  group_by(DiseaseClass) %>%
  mutate(n = n()) %>%
  mutate(DiseaseClass= ifelse(n > 15, DiseaseClass, "Other")) %>%
  ungroup()

fig_3D = Out2 %>%
  mutate(label = ifelse(disease %in% Interest, disease, NA)) %>% 
  mutate(p_ppi_nci = ifelse(padj_ppi_nci < 0.00001, 0.00001, padj_ppi_nci)) %>%
  mutate(p_ppi = ifelse(padj_ppi < 0.00001, 0.00001, padj_ppi)) %>%
  ggplot() +
  aes(y = p_ppi_nci,
      x = p_ppi,
      size = lcc_ppi_nci,
      label = label,
      color = DiseaseClass) +
  geom_abline(slope = 1, color = "gray80") +
  geom_hline(yintercept = 0.05, color = "#d62828", alpha = 0.3) +
  geom_vline(xintercept = 0.05, color = "#d62828", alpha = 0.3) +
  geom_point(shape = "circle",
             alpha = 0.5,
             show.legend = F
  ) +
  geom_text_repel(min.segment.length = 0, 
                  seed = 777, 
                  size = 5, nudge_x = .15,
                  box.padding = 0.5,
                  nudge_y = 1,
                  segment.curvature = -0.1,
                  segment.ncp = 3,
                  segment.angle = 20) + 
  scale_size(range = c(0,3), trans = "log2") +
  scale_y_continuous(trans = "log10",
                     labels = scales::label_scientific())+
  scale_x_continuous(trans = "log10",
                     labels = scales::label_scientific())+
  
  labs(y = "pvalue PPI & NCI (FDR corrected)", 
       x = "pvalue PPI (FDR corrected)",
       size = "LCC PPI & NCI",
       color = NULL) +
  scale_color_manual(values = colors)+
  theme_minimal() +
  theme(legend.position = "none", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5, 
                                   family = "HelveticaNeue"), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5, 
                                        family = "HelveticaNeue"), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue")) ; fig_3D

Figure 3E: Scatterplot Genes vs LCC

##############################
### Genes associated with disease and the LCC size
##############################

prepare_scatter_3e = GDA %>% 
  select(disease = DiseaseName, Total_Genes) %>% 
  unique() %>%
  inner_join(Out) %>% 
  mutate(rLCC = (LCC/Targets)*100) %>% 
  select(disease, Total_Genes, LCC, rLCC, graph) 

fig_3E = prepare_scatter_3e %>%
  group_by(graph) %>%
  mutate(max_rlcc = max(Total_Genes)) %>% 
  mutate(label = ifelse(Total_Genes == max_rlcc, graph, NA)) %>%
  mutate(label = ifelse(disease %in% Interest, disease, label)) %>%
  filter(!(graph %in% "NCI")) %>%
  ggplot() +
  aes(x = Total_Genes, 
      y = LCC, 
      size = rLCC, 
      colour = graph, 
      fill = graph, 
      label = label) +
  geom_point(shape = "circle", alpha = 0.5) +
  geom_text_repel(min.segment.length = 0, 
                  seed = 777, 
                  size = 5,
                  nudge_x = 1,
                  box.padding = 0.5,
                  nudge_y = 1,
                  segment.curvature = -0.1,
                  segment.ncp = 3,
                  segment.angle = 20) + 
  geom_smooth(se = TRUE) + 
  geom_abline(slope = 1, color = "gray80") +
  scale_color_manual(values = PPI_colors) + 
  scale_fill_manual(values = PPI_colors) + 
  scale_size(range = c(0, 3), guide = "none") + 
  theme_minimal() +
  labs(x = "# Genes", y = "LCC")+
  theme(legend.position = "none", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5, 
                                   family = "HelveticaNeue"), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5, 
                                        family = "HelveticaNeue"), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue")); fig_3E

Figure 3F: Half Violins

##############################
### Proportion of non-coding genes.
##############################
LCC_NC = Out2 %>%
  dplyr::inner_join(GDA2Prop, by = c("disease"= "DiseaseName")) %>%
  mutate(ratio_ncc = NC / (NC + PC))

l = LCC_NC %>%
  group_by(DiseaseClass) %>%
  summarise(min = min(ratio_ncc), 
            max = max(ratio_ncc),
            med = median(ratio_ncc)) %>%
  arrange(max, med) %>%
  select(DiseaseClass)

l %<>% dplyr::inner_join(., 
                         data.frame(DiseaseClass =
                                      names(colors), 
                                    label = labels_colors))
colors_original = colors

names(colors) = labels_colors
LCC_NC$DiseaseClass[LCC_NC$DiseaseClass %ni% l$DiseaseClass] <- "Other" 

fig_3F = LCC_NC %>% 
  mutate(label = 
           ifelse(disease %in% Interest, disease, NA)) %>% 
  mutate(DiseaseClass = factor(DiseaseClass, 
                               levels = c(l$DiseaseClass, "Other"), 
                               labels = c(l$label, "Other"))) %>% 
  ggplot() +
  aes(
    x = DiseaseClass,
    y = ratio_ncc * 100,
    fill = DiseaseClass,
    colour = DiseaseClass, 
    label = label
  ) +
  geom_half_violin( scale = "width",
                    side = "r",
                    alpha = 0.5) +
  geom_half_point(size = 0.3,
                  show.legend  = F,
                  width = 0.5) +
  geom_text_repel(min.segment.length = 0, 
                  seed = 777, 
                  size = 5,
                  nudge_x = 1,
                  box.padding = 0.5,
                  nudge_y = 1,
                  segment.curvature = -0.1,
                  segment.ncp = 3,
                  segment.angle = 20) + 
  labs(
    y = "Non Coding Genes Ratio (%)",
    x = "Disease Class",
    fill = NULL,
    color = NULL
  ) +
  scale_y_continuous(limits = c(0, 100)) + 
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "none", 
        text = element_text(size = 18, 
                            family = "HelveticaNeue"),
        axis.title  = element_text(face = "bold",
                                   hjust = 0.5, 
                                   family = "HelveticaNeue"), 
        axis.title.x.top = element_text(face = "bold", 
                                        hjust = 0.5, 
                                        family = "HelveticaNeue"), 
        strip.text = element_text(face = "bold", 
                                  hjust = 0.5, 
                                  family = "HelveticaNeue"))
fig_3F

Figure 4: Physically-Interacting genes are co-expressed.

The data is created in 02_Corr.R and performance is evaluated in 03_Corr_Performance.

Figure 5: Disease similarity on the two networks.

Disease Similarities are computed in 01_LCC_Sab.R.

Figure 5A: RA comorbidities

##############################
### Comorbidities data is derived from
### doi: 10.1371/journal.pcbi.1000353
##############################

TDN = fread("../data/input/Comorbidities_RR.csv") %>%
  janitor::clean_names() %>%
  rename(x = disease_1, 
         y = disease_2, 
         como_p = p_12) %>%
  rename(RR = `rel_risk_12`) %>%
  mutate(percent = (co_12/(prev_1 + prev_2 - co_12))*100) 

GeneOverlap = fread("../data/output/SAB_JAC.csv")
GeneOverlap %<>%
  mutate(neg = ifelse(Sab < 0, "sab<0", "sab>0")) %>% 
  group_by(graph, neg) %>%
  mutate(Sab_norm = CoDiNA::normalize(abs(Sab))) %>%
  ungroup() %>%
  select(-neg) %>% 
  group_by(graph) %>% 
  mutate(Jac_p = 1-p) %>%
  select(-p) %>% 
  rename(Sab_p = pvalue_lt) %>%
  mutate(graph = ifelse(graph == "gPPInc", "PPI & NCI", graph)) %>% 
  mutate(graph = ifelse(graph == "gPPI", "PPI", graph)) 


GeneOverlap2 = GeneOverlap %>%
  filter((x %in% Euler$PPI & y %in% Euler$PPI & graph %in% "PPI") | (x %in% Euler$PPINCI & y %in% Euler$PPINCI & graph %in% "PPI & NCI") ) %>% 
  left_join(TDN) %>% 
  group_by(graph) %>%
  mutate(como_padj = p.adjust(como_p, method = "fdr")) %>%
  mutate(Sab_padj = p.adjust(Sab_p, method = "fdr")) %>%
  mutate(Jac_padj = p.adjust(Jac_p, method = "fdr")) %>%
  ungroup() %>% 
  mutate(sign_sab_jac_sabn = ifelse(Jac_padj < overlapp &
                                      Sab < 0 &
                                      Sab_norm >= 0.05 &
                                      Sab_p < sabp & 
                                      Jaccard.Index > 0
                                    , "Jac & Sab & Sabn: sign", "Jac & Sab & Sabn: not sign")) %>%
  mutate(sign_sab_p = ifelse(Sab_p < sabp &
                               Sab < 0 , "sab: sign", "sab: not sign")) %>%
  mutate(sign_sab = ifelse(Sab < 0 , "sab < 0", "sab > 0")) %>%
  mutate(sign_jac = ifelse(Jaccard.Index > 0 & Jac_padj < overlapp , "jac == 0", "jac > 0")) %>%
  
  mutate(group = paste(x, y, sep = " -- ")) %>%
  mutate(sign_como = ifelse(como_padj < comop, "Como: sign", "Como: not sign")) %>%
  mutate(sign_sab_jac = ifelse(Jaccard.Index > 0 & Jac_p < overlapp & Sab < 0 & Sab_p < sabp , "Sab & Jac: sign", "Sab & Jac: not sign")) 


GeneOverlap2$graph %<>% 
  factor(., levels = c(  "PPI", "PPI & NCI" ))

GeneOverlap_raw = GeneOverlap2  %>%  
  filter(!is.na(RR)) 

table(GeneOverlap_raw$sign_sab_jac_sabn, GeneOverlap_raw$sign_como)
                            
                             Como: not sign Como: sign
  Jac & Sab & Sabn: not sign           8750       7251
  Jac & Sab & Sabn: sign                193        279
nodes_sab_alt = Out %>% 
  filter(padj < lccp) %>%
  filter(!(disease %like% "experimental")) %>% 
  filter(!(disease %like% "chronic")) %>% 
  filter(!(disease %like% "diffuse")) %>% 
  filter(graph %in% c("PPI", "PPI & NCI")) %>%
  mutate(rLCC = LCC/Targets) %>% 
  select(disease, graph, rLCC) %>% 
  pivot_wider(names_from = graph, 
              values_from = rLCC, 
              names_prefix = "rLCC") %>%
  mutate(PPI_sig = ifelse(!is.na(rLCCPPI),1,0))%>%
  mutate(PPI_NCI_sig = ifelse(!is.na(`rLCCPPI & NCI`),1,0)) %>%
  dplyr::left_join(., Dis_class_dic, by = c("disease" = "DiseaseName")) %>% 
  dplyr::left_join(., data.frame(DiseaseClass = names(colors_original), color = colors_original), by = c("DiseaseClass" = "DiseaseClass")) 

Sabs_Alternative = GeneOverlap2 %>% 
  filter(graph %in% c("PPI", "PPI & NCI")) %>%
  mutate(PPI_DIS = ifelse(graph == "PPI" & x %in% Euler$PPI  & y %in% Euler$PPI, 1,0)) %>% 
  mutate(NCIPPI_DIS = ifelse(graph == "PPI & NCI" & x %in% Euler$PPINCI  & y %in% Euler$PPINCI, 1,0)) %>%
  filter(NCIPPI_DIS == 1 | PPI_DIS == 1) %>% 
  filter(Sab < 0) %>%
  group_by(graph) %>% 
  filter(Sab_norm > 0.07) %>% 
  ungroup() %>% 
  filter(Jaccard.Index > 0.05) %>%
  filter(Sab_p < sabp) %>%
  filter(Jac_padj < overlapp) %>% 
  filter(x %in% nodes_sab_alt$disease) %>%
  filter(y %in% nodes_sab_alt$disease)

Figure 5B: RA comorbidities Expanded

##############################
### Create the full SAB network
##############################

g_sabs_alt = graph_from_data_frame(Sabs_Alternative,
                                   directed = F, 
                                   nodes_sab_alt) %>%
  delete.vertices(., degree(.) == 0 )
vs = V(g_sabs_alt)

values <- lapply(1:length(vs), function(x) c(V(g_sabs_alt)$PPI_sig[x], V(g_sabs_alt)$PPI_NCI_sig[x]))

V(g_sabs_alt)$frame.color = NA
E(g_sabs_alt)$color = ifelse(E(g_sabs_alt)$graph == "PPI & NCI", "#ABE4ED", "#B998AE")
E(g_sabs_alt)$width = abs(E(g_sabs_alt)$Sab) * 2
E(g_sabs_alt)$weight = abs(E(g_sabs_alt)$Sab)^ 2
V(g_sabs_alt)$size = degree(g_sabs_alt) %>%
  CoDiNA::normalize()
V(g_sabs_alt)$size = (V(g_sabs_alt)$size +0.5)*5

##############################
### Filter for RA
##############################
diseases_2 = g_sabs_alt %>% 
  neighborhood(., nodes = "arthritis rheumatoid")
diseases_2
[[1]]
+ 32/493 vertices, named, from c938630:
 [1] arthritis rheumatoid          colonic neoplasms            
 [3] lung neoplasms                atherosclerosis              
 [5] carcinoma non small cell lung inflammatory bowel diseases  
 [7] neoplasms                     inflammation                 
 [9] melanoma                      diabetes mellitus type 1     
[11] diabetes mellitus type 2      adenocarcinoma               
[13] ovarian neoplasms             pancreatic neoplasms         
[15] stroke                        osteoarthritis               
[17] heart failure                 carcinoma renal cell         
[19] urinary bladder neoplasms     glioma                       
+ ... omitted several vertices
g_dis_zoom = g_sabs_alt %>% 
  delete.vertices(., V(.)$name %ni% names(unlist(diseases_2)))

V(g_dis_zoom)$label = V(g_dis_zoom)$name

vs = V(g_dis_zoom)
values <- lapply(1:length(vs), function(x) c(V(g_dis_zoom)$PPI_sig[x], V(g_dis_zoom)$PPI_NCI_sig[x]))

V(g_dis_zoom)$size = degree(g_dis_zoom) %>%
  CoDiNA::normalize()
V(g_dis_zoom)$size = (V(g_dis_zoom)$size +0.5)*8

Cairo::CairoPDF("../figs/Fig_05B.pdf", 
                width = 15, height = 10)
par(mfrow = c(1,2), 
    mar = c(1, 5, 1, 1))
ll = g_dis_zoom %>% 
  layout_with_fr(., weights = abs(E(.)$Sab) * 10, 
                 start.temp = 10) %>%
  igraph::norm_coords()

V(g_dis_zoom)$label.color = "#686868"
V(g_dis_zoom)$label = V(g_dis_zoom)$name

vs = V(g_dis_zoom)
values <- lapply(1:length(vs), function(x) c(V(g_dis_zoom)$PPI_sig[x], V(g_dis_zoom)$PPI_NCI_sig[x]))

E(g_dis_zoom)[!("arthritis rheumatoid" %--% 1:1000)]$color = adjustcolor("gray75", alpha.f = 0.3)

g_dis_zoom %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS == 0])%>%
  plot(, 
       vertex.shape="pie", 
       vertex.label.dist=1,
       vertex.pie=values,
       edge.curved = 0.1, 
       edge.width = abs(E(.)$Sab) * 15, 
       layout = ll,
       vertex.pie.color=list(PPI_colors))
title("PPI")

g_dis_zoom %>% 
  delete_edges(., E(.)[E(.)$NCIPPI_DIS == 0])%>%
  plot(, 
       vertex.shape="pie", 
       vertex.label.dist=1,
       vertex.pie=values,
       edge.curved = 0.1, 
       edge.width = abs(E(.)$Sab) * 10, 
       layout = ll,
       vertex.pie.color=list(PPI_colors))
title("PPI & NCI")

dev.off()
quartz_off_screen 
                2 

Figure 5C: Full Comorbidities Map

g_sabs_alt %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS == 1]) %>%
  delete.vertices(., degree(.) == 0)
IGRAPH bdf25ae UNW- 449 2586 -- 
+ attr: name (v/c), rLCCPPI (v/n), rLCCPPI & NCI (v/n), PPI_sig (v/n),
| PPI_NCI_sig (v/n), value (v/c), n (v/n), DiseaseClass (v/c), color
| (v/c), frame.color (v/l), size (v/n), graph (e/c), Jaccard.Index
| (e/n), Sab (e/n), Sab_p (e/n), Sab_norm (e/n), Jac_p (e/n), co_12
| (e/n), prev_1 (e/n), prev_2 (e/n), total (e/n), RR (e/n), lwr_ci_12
| (e/n), upr_ci_12 (e/n), como_p (e/n), rel_risk_21 (e/n), lwr_ci_21
| (e/n), upr_ci_21 (e/n), p_21 (e/n), percent (e/n), como_padj (e/n),
| Sab_padj (e/n), Jac_padj (e/n), sign_sab_jac_sabn (e/c), sign_sab_p
| (e/c), sign_sab (e/c), sign_jac (e/c), group (e/c), sign_como (e/c),
| sign_sab_jac (e/c), PPI_DIS (e/n), NCIPPI_DIS (e/n), color (e/c),
| width (e/n), weight (e/n)
+ edges from bdf25ae (vertex names):
g_sabs_alt %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS == 1]) %>%
  extract_LCC()
IGRAPH 601648e UNW- 394 2543 -- 
+ attr: name (v/c), rLCCPPI (v/n), rLCCPPI & NCI (v/n), PPI_sig (v/n),
| PPI_NCI_sig (v/n), value (v/c), n (v/n), DiseaseClass (v/c), color
| (v/c), frame.color (v/l), size (v/n), graph (e/c), Jaccard.Index
| (e/n), Sab (e/n), Sab_p (e/n), Sab_norm (e/n), Jac_p (e/n), co_12
| (e/n), prev_1 (e/n), prev_2 (e/n), total (e/n), RR (e/n), lwr_ci_12
| (e/n), upr_ci_12 (e/n), como_p (e/n), rel_risk_21 (e/n), lwr_ci_21
| (e/n), upr_ci_21 (e/n), p_21 (e/n), percent (e/n), como_padj (e/n),
| Sab_padj (e/n), Jac_padj (e/n), sign_sab_jac_sabn (e/c), sign_sab_p
| (e/c), sign_sab (e/c), sign_jac (e/c), group (e/c), sign_como (e/c),
| sign_sab_jac (e/c), PPI_DIS (e/n), NCIPPI_DIS (e/n), color (e/c),
| width (e/n), weight (e/n)
+ edges from 601648e (vertex names):
g_sabs_alt %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS == 0])%>%
  delete.vertices(., degree(.) == 0)
IGRAPH 6443e23 UNW- 338 532 -- 
+ attr: name (v/c), rLCCPPI (v/n), rLCCPPI & NCI (v/n), PPI_sig (v/n),
| PPI_NCI_sig (v/n), value (v/c), n (v/n), DiseaseClass (v/c), color
| (v/c), frame.color (v/l), size (v/n), graph (e/c), Jaccard.Index
| (e/n), Sab (e/n), Sab_p (e/n), Sab_norm (e/n), Jac_p (e/n), co_12
| (e/n), prev_1 (e/n), prev_2 (e/n), total (e/n), RR (e/n), lwr_ci_12
| (e/n), upr_ci_12 (e/n), como_p (e/n), rel_risk_21 (e/n), lwr_ci_21
| (e/n), upr_ci_21 (e/n), p_21 (e/n), percent (e/n), como_padj (e/n),
| Sab_padj (e/n), Jac_padj (e/n), sign_sab_jac_sabn (e/c), sign_sab_p
| (e/c), sign_sab (e/c), sign_jac (e/c), group (e/c), sign_como (e/c),
| sign_sab_jac (e/c), PPI_DIS (e/n), NCIPPI_DIS (e/n), color (e/c),
| width (e/n), weight (e/n)
+ edges from 6443e23 (vertex names):
g_sabs_alt %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS == 0]) %>% 
  extract_LCC()
IGRAPH 2755fb3 UNW- 248 463 -- 
+ attr: name (v/c), rLCCPPI (v/n), rLCCPPI & NCI (v/n), PPI_sig (v/n),
| PPI_NCI_sig (v/n), value (v/c), n (v/n), DiseaseClass (v/c), color
| (v/c), frame.color (v/l), size (v/n), graph (e/c), Jaccard.Index
| (e/n), Sab (e/n), Sab_p (e/n), Sab_norm (e/n), Jac_p (e/n), co_12
| (e/n), prev_1 (e/n), prev_2 (e/n), total (e/n), RR (e/n), lwr_ci_12
| (e/n), upr_ci_12 (e/n), como_p (e/n), rel_risk_21 (e/n), lwr_ci_21
| (e/n), upr_ci_21 (e/n), p_21 (e/n), percent (e/n), como_padj (e/n),
| Sab_padj (e/n), Jac_padj (e/n), sign_sab_jac_sabn (e/c), sign_sab_p
| (e/c), sign_sab (e/c), sign_jac (e/c), group (e/c), sign_como (e/c),
| sign_sab_jac (e/c), PPI_DIS (e/n), NCIPPI_DIS (e/n), color (e/c),
| width (e/n), weight (e/n)
+ edges from 2755fb3 (vertex names):
Cairo::CairoPDF("../figs/Fig_05C.pdf",
                width = 18, height = 10)

par(mfrow = c(1,2), 
    mar = c(0, 0, 5, 0))
ll = g_sabs_alt %>% 
  layout_with_fr(., weights = abs(E(.)$Sab_norm) *3) %>%
  igraph::norm_coords()

V(g_sabs_alt)$label.color = "#686868"
V(g_sabs_alt)$label = ifelse(V(g_sabs_alt)$name %in% c(Interest), V(g_sabs_alt)$name, NA)
g_sabs_alt %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS == 0])%>%
  plot(,
       edge.curved = 0.1,
       vertex.frame.color = V(.)$color,
       vertex.color = ifelse(V(.)$PPI_sig == 0, "white", V(.)$color),
       layout = ll, 
       vertex.size = ifelse(is.na(V(.)$rLCCPPI), 0.5,V(.)$rLCCPPI)*2.5)
title("PPI")

g_sabs_alt %>% 
  delete_edges(., E(.)[E(.)$PPI_DIS != 0])%>%
  plot(,
       edge.curved = 0.1,
       vertex.frame.color = V(.)$color,
       vertex.color = ifelse(V(.)$PPI_NCI_sig == 0, "white", V(.)$color),
       layout = ll, 
       vertex.size = ifelse(is.na(V(.)$`rLCCPPI & NCI`), 0.5,V(.)$`rLCCPPI & NCI`)*2.5)
title("PPI & NCI")

dev.off()
quartz_off_screen 
                2